# Import all required libraries
import nltk
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import word_tokenize, sent_tokenize
from wordcloud import WordCloud, STOPWORDS
# Download the NLTK corpora used later (stopwords for filtering, wordnet).
nltk.download('stopwords')
nltk.download('wordnet')
from google.colab import drive
drive.mount('/content/gdrive')
# Dataset 1: one CSV with 'title', 'text' and a REAL/FAKE string 'label'.
d1 = pd.read_csv("/content/gdrive/MyDrive/DataSets/news.csv")
d1.head()
d1["Article"] = d1["title"] + d1["text"]
# BUG FIX: sample(frac=1) returns a new frame; the original discarded it,
# so the rows were never actually shuffled. Assign the result back.
d1 = d1.sample(frac=1).reset_index(drop=True)  # shuffle 100%
# BUG FIX: use .loc instead of chained assignment (d1.label[...] = ...),
# which can silently write to a temporary copy.
d1.loc[d1.label == 'REAL', 'label'] = 1
d1.loc[d1.label == 'FAKE', 'label'] = 0
d1 = d1.loc[:, ['Article', 'label']]
d1 = d1.dropna()
d1.head()
def wordpre(text):
    """Normalize an article: lower-case and strip bracketed text, URLs,
    HTML tags, punctuation, newlines, digit-bearing tokens, and any
    remaining non-word characters (replaced by spaces).

    FIX: the original ran the catch-all \\W substitution first, which
    destroyed the ':' '/' '<' '>' characters that the URL and HTML-tag
    patterns rely on, making those removals dead code. The \\W pass now
    runs last. Patterns are raw strings (non-raw '\\[' etc. raise
    SyntaxWarning on Python 3.12+).
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)            # [bracketed] segments
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'<.*?>+', '', text)             # HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)           # tokens containing digits
    text = re.sub(r'\W', ' ', text)                # remaining special chars
    return text
# Clean the combined article text for dataset 1.
d1['Article'] = d1['Article'].apply(wordpre)
# Word cloud of REAL-news vocabulary (label == 1).
plt.figure(figsize=(15, 15))
# BUG FIX: join with a space — "".join fused the last word of one article
# onto the first word of the next, fabricating bogus tokens.
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(d1[d1.label == 1].Article))
plt.imshow(wc, interpolation="bilinear")
# Word cloud of FAKE-news vocabulary (label == 0).
plt.figure(figsize=(15, 15))
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(d1[d1.label == 0].Article))
plt.imshow(wc, interpolation="bilinear")
# Dataset 2: separate Fake/True CSVs with a 'subject' column.
dF2 = pd.read_csv("/content/gdrive/MyDrive/DataSets/Fake.csv")
dT2 = pd.read_csv("/content/gdrive/MyDrive/DataSets/True.csv")
# Per-subject counts in real news.
# BUG FIX: Series.iteritems() was removed in pandas 2.0; use .items().
for key, count in dT2.subject.value_counts().items():
    print(f"{key}:\t{count}")
print(f"Total Records:\t{dT2.shape[0]}")
# Per-subject counts in fake news.
for key, count in dF2.subject.value_counts().items():
    print(f"{key}:\t{count}")
print(f"Total Records:\t{dF2.shape[0]}")
# Plot subject distribution of real news.
plt.figure(figsize=(8, 5))
# FIX: seaborn removed positional data arguments; pass x= explicitly.
sns.countplot(x="subject", data=dT2)
plt.show()
# Plot subject distribution of fake news.
plt.figure(figsize=(8, 5))
sns.countplot(x="subject", data=dF2)
plt.show()
dT2['label'] = 1
dF2['label'] = 0
d2 = pd.concat([dT2, dF2])
d2["Article"] = d2["title"] + d2["text"]
# BUG FIX: assign the shuffled frame back; sample() does not shuffle in place.
d2 = d2.sample(frac=1).reset_index(drop=True)  # shuffle 100%
d2 = d2.loc[:, ['Article', 'label']]
d2
# Apply the wordpre cleaner to dataset 2.
d2['Article'] = d2['Article'].apply(wordpre)
# Word cloud of REAL-news vocabulary (label == 1).
plt.figure(figsize=(15, 15))
# BUG FIX: join with a space so article boundaries don't fuse words.
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(d2[d2.label == 1].Article))
plt.imshow(wc, interpolation="bilinear")
# Word cloud of FAKE-news vocabulary (label == 0).
plt.figure(figsize=(15, 15))
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(d2[d2.label == 0].Article))
plt.imshow(wc, interpolation="bilinear")
# Dataset 3: politifact real/fake headline CSVs (title only — no body text).
dR3 = pd.read_csv("/content/gdrive/MyDrive/DataSets/politifact_real.csv")
dF3 = pd.read_csv("/content/gdrive/MyDrive/DataSets/politifact_fake.csv")
dR3['label'] = 1
dF3['label'] = 0
df3 = pd.concat([dR3, dF3])
df3["Article"] = df3["title"]
# BUG FIX: assign the shuffled frame back; sample() does not shuffle in place.
df3 = df3.sample(frac=1).reset_index(drop=True)  # shuffle 100%
df3 = df3.loc[:, ['Article', 'label']]
df3
df3['Article'] = df3['Article'].apply(wordpre)
# Word cloud of REAL-news vocabulary (label == 1).
plt.figure(figsize=(15, 15))
# BUG FIX: join with a space so article boundaries don't fuse words.
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(df3[df3.label == 1].Article))
plt.imshow(wc, interpolation="bilinear")
# Word cloud of FAKE-news vocabulary (label == 0); the original comment
# mislabeled this plot as "Real".
plt.figure(figsize=(15, 15))
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(df3[df3.label == 0].Article))
plt.imshow(wc, interpolation="bilinear")
# Dataset 4: train.csv with 'title', 'text' and a numeric 'label'.
df4 = pd.read_csv("/content/gdrive/MyDrive/DataSets/train.csv")
df4.head()
df4["Article"] = df4["title"] + df4["text"]
# BUG FIX: assign the shuffled frame back; sample() does not shuffle in place.
df4 = df4.sample(frac=1).reset_index(drop=True)  # shuffle 100%
df4 = df4.loc[:, ['Article', 'label']]
df4 = df4.dropna()
df4['Article'] = df4['Article'].apply(wordpre)
# Word cloud of REAL-news vocabulary (label == 1).
plt.figure(figsize=(15, 15))
# BUG FIX: join with a space so article boundaries don't fuse words.
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(df4[df4.label == 1].Article))
plt.imshow(wc, interpolation="bilinear")
# Word cloud of FAKE-news vocabulary (label == 0); the original comment
# mislabeled this plot as "Real".
plt.figure(figsize=(15, 15))
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(df4[df4.label == 0].Article))
plt.imshow(wc, interpolation="bilinear")
# Dataset 5: data.csv with 'Headline', 'Body' and a numeric 'Label'.
d5 = pd.read_csv("/content/gdrive/MyDrive/DataSets/data.csv")
d5
d5["Article"] = d5["Headline"] + d5["Body"]
d5["label"] = d5["Label"]
# BUG FIX: assign the shuffled frame back; sample() does not shuffle in place.
d5 = d5.sample(frac=1).reset_index(drop=True)  # shuffle 100%
d5 = d5.loc[:, ['Article', 'label']]
d5 = d5.dropna()
d5['Article'] = d5['Article'].apply(wordpre)
# Word cloud of REAL-news vocabulary (label == 1).
plt.figure(figsize=(15, 15))
# BUG FIX: join with a space so article boundaries don't fuse words.
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(d5[d5.label == 1].Article))
plt.imshow(wc, interpolation="bilinear")
# Word cloud of FAKE-news vocabulary; the original compared against the odd
# literal `00` (== 0) and mislabeled the plot as "Real".
plt.figure(figsize=(15, 15))
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(d5[d5.label == 0].Article))
plt.imshow(wc, interpolation="bilinear")
# Merge all five cleaned datasets into one corpus.
data = pd.concat([d1, d2, df3, df4, d5])
data.shape
data.head()
# Visualize the class balance of the merged corpus.
data.label.value_counts().plot(kind='bar')
plt.title('label')
plt.grid()
plt.show()
print(data.label.value_counts())
# FIX: "!pip install statsmodels" is IPython shell magic and a syntax error
# in a plain .py file — install dependencies outside the script instead.
# !pip install statsmodels
from textblob import TextBlob, Word, Blobber
# Engineered features: sentiment polarity, character length, word count.
data['polarity'] = data['Article'].map(lambda text: TextBlob(text).sentiment.polarity)
data['review_len'] = data['Article'].astype(str).apply(len)
data['word_count'] = data['Article'].apply(lambda x: len(str(x).split()))
# Plot distributions of the engineered features (polarity, length, word count).
plt.figure(figsize=(20, 5))
# FIX: the 'seaborn-white' style was renamed in matplotlib 3.6.
plt.style.use('seaborn-v0_8-white')
plt.subplot(131)
# FIX: sns.distplot was deprecated and removed; histplot(kde=True) replaces it.
sns.histplot(data['polarity'], kde=True)
fig = plt.gcf()
plt.subplot(132)
sns.histplot(data['review_len'], kde=True)
fig = plt.gcf()
plt.subplot(133)
sns.histplot(data['word_count'], kde=True)
fig = plt.gcf()
# Word cloud of FAKE-news vocabulary (label == 0); the original comment
# mislabeled this plot as "Real".
plt.figure(figsize=(15, 15))
# BUG FIX: join with a space so article boundaries don't fuse words.
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(data[data.label == 0].Article))
plt.imshow(wc, interpolation="bilinear")
# Word cloud of REAL-news vocabulary (label == 1).
plt.figure(figsize=(15, 15))
wc = WordCloud(max_words=2000, width=1600, height=700,
               stopwords=STOPWORDS).generate(" ".join(data[data.label == 1].Article))
plt.imshow(wc, interpolation="bilinear")
# Hold out 20% of the merged corpus for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    data['Article'], data['label'], test_size=0.2, random_state=2021)
x_train.shape
x_test.shape
# Labels arrive as a mix of int/str objects; coerce to int for the metrics.
y_train = y_train.astype('int')
y_test = y_test.astype('int')
# Logistic Regression: bag-of-words -> TF-IDF -> linear classifier.
lr_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LogisticRegression()),
])
Logisticmodel = lr_pipeline.fit(x_train, y_train)
prediction = Logisticmodel.predict(x_test)
Logisticmodel_accuracy = round(accuracy_score(y_test, prediction) * 100, 2)
print("accuracy: {}%".format(Logisticmodel_accuracy))
print(confusion_matrix(y_test, prediction))
print("\nCLassification Report of Logistic Regression Classifier:\n")
print(classification_report(y_test, prediction))
# Decision tree on the same CountVectorizer -> TF-IDF features.
dt_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', DecisionTreeClassifier(criterion='entropy',
                                     max_depth=10,
                                     splitter='best',
                                     random_state=2020)),
])
DecisionTreemodel = dt_pipeline.fit(x_train, y_train)
prediction = DecisionTreemodel.predict(x_test)
DecisionTreemodel_accuracy = round(accuracy_score(y_test, prediction) * 100, 2)
print("accuracy: {}%".format(DecisionTreemodel_accuracy))
print(confusion_matrix(y_test, prediction))
print("\nCLassification Report of DecisionTreeClassifier:\n")
print(classification_report(y_test, prediction))
# FIX: "pip install xgboost" is a shell command, not Python — a syntax error
# in a .py file. Install dependencies outside the script instead.
# pip install xgboost
from xgboost import XGBClassifier
# XGBoost on the same CountVectorizer -> TF-IDF features.
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 # BUG FIX: removed loss='deviance' — that is a
                 # GradientBoostingClassifier parameter; XGBClassifier does
                 # not accept it.
                 ('model', XGBClassifier(learning_rate=0.01,
                                         n_estimators=10,
                                         max_depth=5,
                                         random_state=2021))])
xgboostmodel = pipe.fit(x_train, y_train)
prediction = xgboostmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100, 2)))
xgboostmodel_accuracy = round(accuracy_score(y_test, prediction)*100, 2)
print(confusion_matrix(y_test, prediction))
print("\nCLassification Report of XGBoostClassifier:\n")
print(classification_report(y_test, prediction))
# Multinomial Naive Bayes on the same CountVectorizer -> TF-IDF features.
nb_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', MultinomialNB()),
])
MNBCmodel = nb_pipeline.fit(x_train, y_train)
prediction = MNBCmodel.predict(x_test)
Multinomial_Naive_Bayes_accuracy = round(accuracy_score(y_test, prediction) * 100, 2)
print("accuracy: {}%".format(Multinomial_Naive_Bayes_accuracy))
print(confusion_matrix(y_test, prediction))
print("\nCLassification Report of Multinomial Naive Bayes Classifier:\n")
print(classification_report(y_test, prediction))
from nltk.corpus import stopwords
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
# PERF FIX: membership tests against a list are O(n) per word; a set makes
# the per-word stop-word check O(1) over the whole corpus.
stop = set(stopwords.words('english'))
data['Article'] = data['Article'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop))
data.head()
# Keep only the cleaned text for sequence modelling.
news_features = data.copy()
news_features = news_features[['Article']].reset_index(drop=True)
news_features.head()
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()
# Stem every article: keep alphabetic characters only, lower-case,
# drop stop words, and Porter-stem the remainder.
corpus = []
# IDIOM FIX: iterate the column directly instead of range(len(...)) indexing;
# 'word not in stop_words' instead of 'not word in stop_words'.
for article in news_features['Article']:
    tokens = re.sub('[^a-zA-Z]', ' ', article).lower().split()
    stemmed = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stemmed))
corpus[1]
# Vocabulary size for the one-hot hashing trick.
voc_size=10000
# Hash every stemmed article into a sequence of integer token ids.
onehot_repr=[one_hot(words,voc_size)for words in corpus]
# Fixed sequence length fed to the LSTM.
sent_length=5000
#Padding the sentences
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)
embedded_docs[1]
# Bigram TF-IDF features.
# NOTE(review): X is only inspected via .shape below and is never consumed
# by the LSTM, which trains on embedded_docs — confirm whether this TF-IDF
# matrix is still needed.
tfidf_vectorizer = TfidfVectorizer(max_features=5000,ngram_range=(2,2))
# TF-IDF feature matrix
X= tfidf_vectorizer.fit_transform(news_features['Article'])
X.shape
y=data['label']
len(embedded_docs),y.shape
# Converting the X and y as array
X_final=np.asarray(embedded_docs).astype(np.float32)
y_final=np.asarray(y).astype(np.float32)
#Check shape of X and y final
X_final.shape,y_final.shape
# Bidirectional LSTM classifier over the hashed, padded token sequences.
embedding_vector_features = 40
model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    Dropout(0.3),
    Bidirectional(LSTM(100)),  # 100 LSTM units per direction
    Dropout(0.3),
    Dense(1, activation='sigmoid'),  # binary real/fake output
])
# Binary cross-entropy with Adam; track accuracy during training.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# Train/test split on the padded sequences (33% held out).
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.33, random_state=42)
# Fit for 10 epochs with batch size 64.
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=10, batch_size=64)
# Predict on the test data and threshold the sigmoid outputs to 0/1.
y_pred = model.predict(X_test)
y_pred = np.round(y_pred).astype(int)
# BUG FIX: the accuracy was stored in a variable named LSTM, shadowing the
# imported keras LSTM layer class; renamed to LSTM_accuracy.
LSTM_accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
print(confusion_matrix(y_test, y_pred))
accuracy_score(y_test, y_pred)
print("\nCLassification Report Long-Short Term Memory:\n")
print(classification_report(y_test, y_pred))
# Horizontal bar chart comparing all model accuracies.
x = ["Logisticmodel_accuracy", "xgboostmodel_accuracy",
     "DecisionTreemodel_accuracy", "Multinomial_Naive_Bayes_accuracy",
     "LSTM"]
y = [Logisticmodel_accuracy, xgboostmodel_accuracy,
     DecisionTreemodel_accuracy, Multinomial_Naive_Bayes_accuracy,
     LSTM_accuracy]
plt.barh(x, y)
for index, value in enumerate(y):
    plt.text(value, index, str(value))
model.save('LSTM_model')
from mlxtend.evaluate import paired_ttest_5x2cv
# 5x2cv paired t-test: is the accuracy difference between the two models real?
# BUG FIX: 'Logesticmodel' was a typo that raised NameError; the fitted
# logistic-regression pipeline is named Logisticmodel.
# NOTE(review): paired_ttest_5x2cv refits both estimators through the sklearn
# estimator API; a compiled Keras model does not implement clone/fit/predict
# the way mlxtend expects — likely needs a KerasClassifier wrapper. Confirm.
t, p = paired_ttest_5x2cv(estimator1=Logisticmodel,
                          estimator2=model,
                          X=X_train,
                          y=y_train,
                          scoring='accuracy',
                          random_seed=1)
# Summarize the statistic.
print(f'The P-value is = {p:.3f}')
print(f'The t-statistics is = {t:.3f}')
# Interpret the result at the 5% significance level.
if p <= 0.05:
    print('Since p<0.05, We can reject the null-hypothesis that both models perform equally well on this dataset. We may conclude that the two algorithms are significantly different.')
else:
    print('Since p>0.05, we cannot reject the null hypothesis and may conclude that the performance of the two algorithms is not significantly different.')